Zrm(list=ls(all=T))
Sys.setlocale("LC_TIME","C")## [1] "C"
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr)Z = read_csv("data/ta_feng_all_months_merged.csv") %>% data.frame %>%
setNames(c("date","cust","age","area","cat","prod","qty","cost","price"))## Parsed with column specification:
## cols(
## TRANSACTION_DT = col_character(),
## CUSTOMER_ID = col_character(),
## AGE_GROUP = col_character(),
## PIN_CODE = col_character(),
## PRODUCT_SUBCLASS = col_double(),
## PRODUCT_ID = col_character(),
## AMOUNT = col_double(),
## ASSET = col_double(),
## SALES_PRICE = col_double()
## )
nrow(Z)## [1] 817741
Z$date = as.Date(Z$date, format="%m/%d/%Y")
Z$age[is.na(Z$age)] = "na"
Z$age = factor(Z$age, levels=c(
"<25","25-29","30-34","35-39","40-44","45-49","50-54","55-59","60-64",">65","na"), labels=c(
"a20","a25","a30","a35","a40","a45","a50","a55","a60","a65","na"
)) %>% as.character
Z$area = paste0("z",Z$area)
summary(Z)## date cust age
## Min. :2000-11-01 Length:817741 Length:817741
## 1st Qu.:2000-11-28 Class :character Class :character
## Median :2001-01-01 Mode :character Mode :character
## Mean :2000-12-30
## 3rd Qu.:2001-01-30
## Max. :2001-02-28
## area cat prod qty
## Length:817741 Min. :100101 Length:817741 Min. : 1.000
## Class :character 1st Qu.:110106 Class :character 1st Qu.: 1.000
## Mode :character Median :130106 Mode :character Median : 1.000
## Mean :284950 Mean : 1.382
## 3rd Qu.:520314 3rd Qu.: 1.000
## Max. :780510 Max. :1200.000
## cost price
## Min. : 0.0 Min. : 1.0
## 1st Qu.: 35.0 1st Qu.: 42.0
## Median : 62.0 Median : 76.0
## Mean : 112.1 Mean : 131.9
## 3rd Qu.: 112.0 3rd Qu.: 132.0
## Max. :432000.0 Max. :444000.0
# Quantile of Variables
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))## qty cost price
## 99% 6 858.0 1014.00
## 99.9% 14 2722.0 3135.82
## 99.95% 24 3799.3 3999.00
# Remove Outliers
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000)
nrow(Z) ## [1] 817182
Z$tid = group_indices(Z, date, cust) # same customer same day# No. cust, cat, prod, tid
sapply(Z[c("cust","cat","prod","tid")], n_distinct)## cust cat prod tid
## 32256 2007 23789 119422
# Summary of Item Records
summary(Z)## date cust age
## Min. :2000-11-01 Length:817182 Length:817182
## 1st Qu.:2000-11-28 Class :character Class :character
## Median :2001-01-01 Mode :character Mode :character
## Mean :2000-12-30
## 3rd Qu.:2001-01-30
## Max. :2001-02-28
## area cat prod qty
## Length:817182 Min. :100101 Length:817182 Min. : 1.000
## Class :character 1st Qu.:110106 Class :character 1st Qu.: 1.000
## Mode :character Median :130106 Mode :character Median : 1.000
## Mean :284784 Mean : 1.358
## 3rd Qu.:520311 3rd Qu.: 1.000
## Max. :780510 Max. :24.000
## cost price tid
## Min. : 0.0 Min. : 1.0 Min. : 1
## 1st Qu.: 35.0 1st Qu.: 42.0 1st Qu.: 28783
## Median : 62.0 Median : 76.0 Median : 59391
## Mean : 106.2 Mean : 125.5 Mean : 58845
## 3rd Qu.: 112.0 3rd Qu.: 132.0 3rd Qu.: 87391
## Max. :3798.0 Max. :4000.0 Max. :119422
XX = Z %>% group_by(tid) %>% summarise(
date = date[1], # 交易日期
cust = cust[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(X) # 119422 ## [1] 119422
summary(X) ## tid date cust
## Min. : 1 Min. :2000-11-01 Length:119422
## 1st Qu.: 29856 1st Qu.:2000-11-29 Class :character
## Median : 59712 Median :2001-01-01 Mode :character
## Mean : 59712 Mean :2000-12-31
## 3rd Qu.: 89567 3rd Qu.:2001-02-02
## Max. :119422 Max. :2001-02-28
## age area items pieces
## Length:119422 Length:119422 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 2.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 5.000 Median : 6.000
## Mean : 6.843 Mean : 9.294
## 3rd Qu.: 9.000 3rd Qu.: 12.000
## Max. :112.000 Max. :339.000
## total gross
## Min. : 5 Min. :-1645.0
## 1st Qu.: 227 1st Qu.: 21.0
## Median : 510 Median : 68.0
## Mean : 859 Mean : 132.3
## 3rd Qu.: 1082 3rd Qu.: 169.0
## Max. :30171 Max. : 8069.0
# Check Quantile & Remove Outliers
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))## items pieces total gross
## 99.9% 54 81.0000 9009.579 1824.737
## 99.95% 62 94.2895 10611.579 2179.817
## 99.99% 82 133.0000 16044.401 3226.548
# Remove Outliers
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328par(cex=0.8)
hist(X$date, "weeks", freq=T, las=2, main="No. Transaction per Week")Ad0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% data.frame # 33241
nrow(A)## [1] 32241
summary(A) ## cust r s f
## Length:32241 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.: 9.00 1st Qu.: 56.00 1st Qu.: 1.000
## Mode :character Median : 26.00 Median : 92.00 Median : 2.000
## Mean : 37.45 Mean : 80.78 Mean : 3.701
## 3rd Qu.: 60.00 3rd Qu.:110.00 3rd Qu.: 4.000
## Max. :120.00 Max. :120.00 Max. :85.000
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -784.0 Length:32241
## 1st Qu.: 365.0 1st Qu.: 707 1st Qu.: 75.0 Class :character
## Median : 705.7 Median : 1750 Median : 241.0 Mode :character
## Mean : 993.1 Mean : 3152 Mean : 484.6
## 3rd Qu.: 1291.0 3rd Qu.: 3968 3rd Qu.: 612.0
## Max. :12636.0 Max. :127686 Max. :20273.0
## area
## Length:32241
## Class :character
## Mode :character
##
##
##
#做出RF分佈圖
# 切割頻率
orders.segm <- A %>%
mutate(buy_freq=ifelse(between(f, 1, 10), '1',
ifelse(between(f, 11, 20), '2',
ifelse(between(f, 21, 30), '3',
ifelse(between(f, 31, 40), '4',
ifelse(between(f, 41, 50), '5', '>5')))))) %>%
# 切割近因畫出邊界
mutate(segm.rec=ifelse(between(r, 0, 7), '0-7 days',
ifelse(between(r, 8, 15), '8-15 days',
ifelse(between(r, 16, 22), '16-22 days',
ifelse(between(r, 23, 30), '23-30 days',
ifelse(between(r, 31, 55), '31-55 days', '>55 days')))))) %>%
arrange(A$cust)
# 定義邊界的順序
orders.segm$buy_freq <- factor(orders.segm$buy_freq, levels=c('>5', '5', '4', '3', '2', '1'))
orders.segm$segm.rec <- factor(orders.segm$segm.rec, levels=c('>55 days', '31-55 days', '23-30 days', '16-22 days', '8-15 days', '0-7 days'))
lcg <- orders.segm %>%
group_by(segm.rec, buy_freq) %>%
summarise(quantity=n()) %>%
mutate(client='customers') %>%
ungroup()
lcg.matrix= as.data.frame.matrix(table(orders.segm$buy_freq, orders.segm$segm.rec))
lcg.matrix$buy_freq = row.names(lcg.matrix)
lcg.matrix## >55 days 31-55 days 23-30 days 16-22 days 8-15 days 0-7 days buy_freq
## >5 0 0 0 0 0 38 >5
## 5 0 0 0 0 0 46 5
## 4 0 0 0 0 9 68 4
## 3 0 2 2 5 40 254 3
## 2 15 31 34 69 372 935 2
## 1 8639 6223 2530 2817 4418 5694 1
# 繪製RFM分析圖
lcg.adv <- lcg %>%
mutate(rec.type = ifelse(segm.rec %in% c(">55 days", "31-55 days", "23-30 days"), "not recent", "recent"),
freq.type = ifelse(buy_freq %in% c(">5", "5", "4"), "frequent", "infrequent"),
customer.type = interaction(rec.type, freq.type))
ggplot(lcg.adv, aes(x=client, y=quantity, fill=customer.type)) +
theme_bw() +
geom_rect(aes(fill = customer.type), xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf, alpha = 0.1) +
facet_grid(buy_freq ~ segm.rec) +
geom_bar(stat='identity', alpha=0.7) +
geom_text(aes(y=max(quantity)/2, label=quantity), size=4) +
ggtitle("R&F Analysis Graphics") +
xlab("Days to last time purchase") + ylab("Purchase Frequency")+
guides(fill=guide_legend(title="Group color"))+
scale_fill_discrete(name="Experimental\nCondition",breaks = c('not recent.frequent','recent.frequent','not recent.infrequent','recent.infrequent'), labels = c('Former Customers','Frequent-coming Customers','One-time Customers','New Customers'))lcg.sub <- orders.segm %>%
group_by(segm.rec, buy_freq) %>%
summarise(quantity=n()) %>%
mutate(client='customers') %>%
ungroup()#細部切割10個客群
library(plotly)##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
set.seed(111)
A$grp = kmeans(scale(A[,c(2:5)]),10)$cluster
table(A$grp) # 族群大小##
## 1 2 3 4 5 6 7 8 9 10
## 3247 1653 5239 337 2381 4706 1145 6470 2788 4275
#RFM分群泡泡圖
group_by(A, grp) %>% summarise(
recent=mean(r),
freq=mean(f),
money=mean(m),
size=n() ) %>%
mutate( revenue = size*money/1000 ) %>%
filter(size > 1) %>%
ggplot(aes(x=freq, y=money)) +
geom_point(aes(size=revenue, col=recent),alpha=0.5) +
scale_size(range=c(4,30)) +
scale_color_gradient(low="green",high="red") +
scale_x_log10() + scale_y_log10(limits=c(30,5000)) +
geom_text(aes(label = size ),size=3) +
theme_bw() + guides(size=F) +
labs(title="Customer Segements",
subtitle="(bubble_size:revenue_contribution; text:group_size)",
color="Recency") +
xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")ggplotly()#分佈情形
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m'))
hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)#定位分群
STS = c("菜雞","積優股","冤大頭","提款機","沈思者","大雄","Almost葛屁")
Status = function(rx,fx,mx,sx,K) {factor(
ifelse(sx < 3*K,
ifelse(fx*mx > 1600, "積優股", "菜雞"),
ifelse(rx < 2*K,
ifelse(sx/fx < 0.41*K,"提款機","冤大頭"),
ifelse(rx < 4*K,"沈思者",
ifelse(rx < 5*K,"大雄","Almost葛屁")))), STS)}K = as.integer(sum(A$s[A$f>1]) / sum(A$f[A$f>1])); K## [1] 17
is.na(Z) %>% colSums## date cust age area cat prod qty cost price tid
## 0 0 0 0 0 0 0 0 0 0
is.na(X) %>% colSums## tid date cust age area items pieces total gross
## 0 0 0 0 0 0 0 0 0
is.na(A) %>% colSums## cust r s f m rev raw age area grp
## 0 0 0 0 0 0 0 0 0 0
A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")Sys.setlocale("LC_TIME","C")## [1] "C"
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr)
load("data/tf0.rdata")Remove data after the demarcation date
feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01) # 618212X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387summary(X)## tid date cust
## Min. : 1 Min. :2000-11-01 Length:88387
## 1st Qu.:22098 1st Qu.:2000-11-23 Class :character
## Median :44194 Median :2000-12-12 Mode :character
## Mean :44194 Mean :2000-12-15
## 3rd Qu.:66290 3rd Qu.:2001-01-12
## Max. :88387 Max. :2001-01-31
## age area items pieces
## Length:88387 Length:88387 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 2.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 5.000 Median : 6.000
## Mean : 6.994 Mean : 9.453
## 3rd Qu.: 9.000 3rd Qu.: 12.000
## Max. :112.000 Max. :339.000
## total gross
## Min. : 5.0 Min. :-1645.0
## 1st Qu.: 230.0 1st Qu.: 23.0
## Median : 522.0 Median : 72.0
## Mean : 888.7 Mean : 138.3
## 3rd Qu.: 1120.0 3rd Qu.: 174.0
## Max. :30171.0 Max. : 8069.0
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))## items pieces total gross
## 99.9% 56.0000 84.0000 9378.684 1883.228
## 99.95% 64.0000 98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295d0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
Status = Status(r,f,m,s,K)
) %>% data.frame # 28584
nrow(A)## [1] 28584
feb = filter(X0, date>= feb01) %>% group_by(cust) %>%
summarise(amount = sum(total)) # 16899A$amountSimply a Left Joint
A = merge(A, feb, by="cust", all.x=T)A$buyA$buy = !is.na(A$amount)summary(A)## cust r s f
## Length:28584 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Mode :character Median :21.00 Median :68.00 Median : 2.000
## Mean :32.12 Mean :61.27 Mean : 3.089
## 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :92.00 Max. :92.00 Max. :60.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -742.0 Length:28584
## 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 Class :character
## Median : 709.5 Median : 1566 Median : 218.0 Mode :character
## Mean : 1012.4 Mean : 2711 Mean : 420.8
## 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0
## Max. :10634.0 Max. :99597 Max. :15565.0
##
## area Status amount buy
## Length:28584 菜雞 : 5698 Min. : 8 Mode :logical
## Class :character 積優股 : 2489 1st Qu.: 454 FALSE:15342
## Mode :character 冤大頭 :10048 Median : 993 TRUE :13242
## 提款機 : 822 Mean : 1499
## 沈思者 : 5640 3rd Qu.: 1955
## 大雄 : 3005 Max. :28089
## Almost葛屁: 882 NA's :15342
tapply(A$buy, A$age, mean) %>% barplot
abline(h = mean(A$buy), col='red')tapply(A$buy, A$area, mean) %>% barplot(las=2)
abline(h = mean(A$buy), col='red')X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))## [1] 28584 20008 8576
cbind(A, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))## [1] 13242 9269 3973
cbind(A2, spl2) %>%
ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)save(Z, X, A, spl, spl2, file="data/tf2.rdata")TR = subset(A, spl)
TS = subset(A, !spl)glm1 = glm(buy ~ ., TR[,c(2:8,10,12)], family=binomial())
summary(glm1)##
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:8,
## 10, 12)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5766 -0.8455 -0.7286 1.0460 1.9211
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.160e+00 9.124e-02 -12.712 < 2e-16 ***
## r -1.289e-02 1.819e-03 -7.087 1.38e-12 ***
## s 7.528e-03 1.578e-03 4.770 1.84e-06 ***
## f 3.257e-01 1.735e-02 18.770 < 2e-16 ***
## m -5.491e-05 3.011e-05 -1.824 0.06820 .
## rev 3.900e-05 1.942e-05 2.009 0.04455 *
## raw -2.251e-04 8.488e-05 -2.652 0.00801 **
## agea25 -4.958e-02 8.684e-02 -0.571 0.56802
## agea30 -8.203e-03 7.996e-02 -0.103 0.91829
## agea35 4.574e-02 7.921e-02 0.577 0.56369
## agea40 5.508e-02 8.131e-02 0.677 0.49816
## agea45 -9.187e-04 8.463e-02 -0.011 0.99134
## agea50 5.231e-03 9.325e-02 0.056 0.95527
## agea55 1.597e-01 1.093e-01 1.460 0.14419
## agea60 5.839e-02 1.176e-01 0.497 0.61937
## agea65 2.504e-01 1.046e-01 2.394 0.01669 *
## agena -2.537e-01 1.459e-01 -1.738 0.08213 .
## Status積優股 1.005e-01 7.061e-02 1.424 0.15454
## Status冤大頭 1.081e-01 8.715e-02 1.240 0.21489
## Status提款機 -1.290e+00 2.725e-01 -4.735 2.19e-06 ***
## Status沈思者 2.423e-01 8.257e-02 2.934 0.00334 **
## Status大雄 1.554e-01 1.126e-01 1.380 0.16767
## StatusAlmost葛屁 6.434e-02 1.559e-01 0.413 0.67979
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27629 on 20007 degrees of freedom
## Residual deviance: 23359 on 19985 degrees of freedom
## AIC: 23405
##
## Number of Fisher Scoring iterations: 6
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm## predict
## actual FALSE TRUE
## FALSE 3755 848
## TRUE 1684 2289
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.69998## [1] 0.7047575
colAUC(pred, TS$buy) # 0.7556## [,1]
## FALSE vs. TRUE 0.7518547
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)lm1 = lm(amount ~ ., TR2[,c(2:6,8,9,11)])
summary(lm1)##
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8, 9, 11)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83304 -0.22812 0.04853 0.28096 1.64243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1403704 0.0504979 22.583 < 2e-16 ***
## r 0.0000702 0.0003090 0.227 0.82030
## s 0.0001173 0.0003123 0.376 0.70721
## f 0.0256836 0.0017965 14.296 < 2e-16 ***
## m 0.5045943 0.0372711 13.539 < 2e-16 ***
## rev 0.0450307 0.0360945 1.248 0.21222
## agea25 0.0737926 0.0251165 2.938 0.00331 **
## agea30 0.1204660 0.0230651 5.223 1.80e-07 ***
## agea35 0.1264592 0.0227496 5.559 2.79e-08 ***
## agea40 0.1382214 0.0232522 5.944 2.87e-09 ***
## agea45 0.1085828 0.0242698 4.474 7.77e-06 ***
## agea50 0.0787808 0.0264917 2.974 0.00295 **
## agea55 0.0703242 0.0312462 2.251 0.02443 *
## agea60 0.0694822 0.0321119 2.164 0.03051 *
## agea65 -0.0284007 0.0282282 -1.006 0.31439
## agena 0.1124434 0.0395590 2.842 0.00449 **
## areaz106 0.0789586 0.0435321 1.814 0.06974 .
## areaz110 0.0375241 0.0353641 1.061 0.28868
## areaz114 -0.0111101 0.0371762 -0.299 0.76506
## areaz115 0.0111809 0.0325803 0.343 0.73147
## areaz221 0.0147066 0.0328140 0.448 0.65403
## areazOthers 0.0249228 0.0349567 0.713 0.47589
## areazUnknown 0.0105550 0.0388962 0.271 0.78612
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4216 on 9246 degrees of freedom
## Multiple R-squared: 0.291, Adjusted R-squared: 0.2893
## F-statistic: 172.5 on 22 and 9246 DF, p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)## [1] 0.2909908 0.2575966
load("data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>%
filter(date >= as.Date("2000-12-01")) %>%
mutate(days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
Status = Status(r,f,m,s,K)
) %>% data.frame # 28584
nrow(B)## [1] 28531
B$Buy = predict(glm1, B, type="response")B$Rev = predict(lm1, B)par(mfrow=c(1,2), cex=0.8)
hist(B$Buy)
hist(log(B$Rev,10))par(mfrow=c(1,2), cex=0.8)
hist(B$Buy)
hist(log(B$Rev,10))#篩出目標顧客以做行銷
Target = subset(B, Status=="績優股"|Status == "菜雞")
Target%>%group_by(age)%>%summarise(
Rev = sum(Rev)
)%>%arrange(desc(Rev))## # A tibble: 11 x 2
## age Rev
## <chr> <dbl>
## 1 a35 476791.
## 2 a30 453647.
## 3 a40 367667.
## 4 a45 265742.
## 5 a25 235131.
## 6 a50 163905.
## 7 a20 116004.
## 8 a65 81776.
## 9 a55 78311.
## 10 a60 50466.
## 11 na 37602.
P0=A$ProbRetain
R0=A$PredRevenue m=0.20; a=20; b=15
curve(m*plogis((10/a)*(x-b)), 0, 30, lwd=2, ylim=c(0, 0.25),
main=TeX('$m \\cdot Logis(10(x - b)/a)$'), ylab="f(x)")
abline(h=seq(0,0.2,0.05),v=seq(0,30,5),col='lightgrey',lty=2)library(manipulate)
#使用manipulate套件進行行銷模擬(需複製到R Script)
# manipulate({
# curve(m*plogis((10/a)*(x-b)), 0, 30, lwd=2, ylim=c(0, 0.25),
# main = TeX('$m \\cdot Logis(10(x - b)/a)$'), ylab="f(x)")
# abline(h=seq(0,0.2,0.05),v=seq(0,30,5),col='lightgrey',lty=2)
# },
# m = slider(0.05, 0.25, 0.20, step=0.01),
# a = slider( 10, 30, 20, step=1),
# b = slider( 4, 20, 15, step=1)
# )
#調整變數
#行銷情境1
m=0.2; a=20; b=20
m=0.2; a## [1] 20
do.call(rbind, lapply(seq(5,40,0.5), function(c){
p = m*plogis((10/a)*(c-b))
Target %>% mutate(
PI = ifelse(Buy<=(1-p), p, 1-Buy) * Rev - c
) %>%
group_by(Status) %>% summarise(
Cost = c,
Group.Sz = n(),
No.Target = sum(PI>0),
AvgROI = mean(PI[PI>0]),
TotalROI = sum(PI[PI>0])
) } ) ) %>%
ggplot(aes(x=Cost, y=TotalROI, col=Status)) +
geom_line(size=1.2) +
ggtitle("Cost Effeciency per Segment ")#行銷情境二
m=0.2; a=20; b=11
do.call(rbind, lapply(seq(5,40,0.5), function(c){
p = m*plogis((10/a)*(c-b))
Target %>% mutate(
PI = ifelse(Buy<=(1-p), p, 1-Buy) * Rev - c
) %>%
group_by(Status) %>% summarise(
Cost = c,
Group.Sz = n(),
No.Target = sum(PI>0),
AvgROI = mean(PI[PI>0]),
TotalROI = sum(PI[PI>0])
) } ) ) %>%
ggplot(aes(x=Cost, y=TotalROI, col=Status)) +
geom_line(size=1.2) +
ggtitle("Cost Effeciency per Segment ")